In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chisquare
from warnings import warn
from functools import lru_cache
from scipy.special import gammaln
from math import lgamma, log
from pgmpy.models import BayesianModel
from functools import partial 

from IPython.display import Markdown, display
import networkx as nx
import pylab as plt
def printmd(string):
    display(Markdown(string))

Scoring for Structure learning DAG

Commonly used scoring functions to measure the fit between model and data are Bayesian Dirichlet scores

Scoring functions for learning Bayesian networks

  • BDeu
  • K2
  • Bayesian Information Criterion
In [2]:
# ref: https://pgmpy.org/index.html

class BaseEstimator(object):
    def __init__(self, data=None, state_names=None, complete_samples_only=True):
        self.data = data
        if self.data is not None:
            self.complete_samples_only = complete_samples_only
            self.variables = list(data.columns.values)
            if not isinstance(state_names, dict):
                self.state_names = {
                    var: self._collect_state_names(var) for var in self.variables
                }
            else:
                self.state_names = dict()
                for var in self.variables:
                    if var in state_names:
                        if not set(self._collect_state_names(var)) <= set(
                            state_names[var]
                        ):
                            raise ValueError(
                                f"Data contains unexpected states for variable: {var}."
                            )
                        self.state_names[var] = state_names[var]
                    else:
                        self.state_names[var] = self._collect_state_names(var)
                        
    def _collect_state_names(self, variable):
        "Return a list of states that the variable takes in the data"
        states = sorted(list(self.data.loc[:, variable].dropna().unique()))
        return states
    
    def convert_args_tuple(func):
        def _convert_param_to_tuples(
            obj, variable, parents=tuple(), complete_samples_only=None
        ):
            parents = tuple(parents)
            return func(obj, variable, parents, complete_samples_only)

        return _convert_param_to_tuples
    
    @convert_args_tuple
    @lru_cache(maxsize=2048)
    def state_counts(self, variable, parents=[], complete_samples_only=None):
        """
        Return counts how often each state of 'variable' occurred in the data.
        If a list of parents is provided, counting is done conditionally
        for each state configuration of the parents.
        Returns
        -------
        state_counts: pandas.DataFrame
            Table with state counts for 'variable'
        """
        parents = list(parents)

        # default for how to deal with missing data can be set in class constructor
        if complete_samples_only is None:
            complete_samples_only = self.complete_samples_only
        # ignores either any row containing NaN, or only those where the variable or its parents is NaN
        data = (
            self.data.dropna()
            if complete_samples_only
            else self.data.dropna(subset=[variable] + parents)
        )

        if not parents:
            # count how often each state of 'variable' occured
            state_count_data = data.loc[:, variable].value_counts()
            state_counts = (
                state_count_data.reindex(self.state_names[variable])
                .fillna(0)
                .to_frame()
            )

        else:
            parents_states = [self.state_names[parent] for parent in parents]
            # count how often each state of 'variable' occured, conditional on parents' states
            state_count_data = (
                data.groupby([variable] + parents).size().unstack(parents)
            )
            if not isinstance(state_count_data.columns, pd.MultiIndex):
                state_count_data.columns = pd.MultiIndex.from_arrays(
                    [state_count_data.columns]
                )
                
            row_index = self.state_names[variable]
            column_index = pd.MultiIndex.from_product(parents_states, names=parents)
            state_counts = state_count_data.reindex(
                index=row_index, columns=column_index
            ).fillna(0)

        return state_counts
    
In [3]:
# Importing data
GME_stock_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/GME_stock.csv')
GME_stock_df[1:] = GME_stock_df[1:].round()
GME_stock_df = GME_stock_df[GME_stock_df.date>'2011-01-01'].round(1)
GME_stock_df['date'] = pd.to_datetime(GME_stock_df['date'])
GME_stock_df.set_index('date',inplace=True)
display(GME_stock_df.head())
# Plotting the open, close, high, low, volume and adjusted close value for the term of 12 months
fig = px.line(GME_stock_df, x=GME_stock_df.index, y=GME_stock_df.columns,
              title='Plot of values for a 12 month period')
fig.update_xaxes(
    dtick="M12",
    tickformat="%b\n%Y")


fig.update_xaxes(rangeslider_visible=True)
fig.show()
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
In [4]:
# Importing data Tatanic 
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())
age portembarked fare numparentschildren passengerclass sex numsiblings survived
0 1 1 1 1 1 1 1 1
1 2 2 1 1 2 2 1 2
2 1 1 1 1 1 2 1 2
3 2 1 1 1 2 2 1 2
4 2 1 1 1 1 1 1 1

Bayesian scoring functions

* Compute the posteriorprobability distribution, starting from a prior probability distribution on the possible networks B, conditioned to data D, that is,P(B|D).

* The best network is the one that maximizes the posterior probability.

* For comparing the BNs, can be translated to comparsion of P(B|D) (Larger P(B|D) is better).

Types of Bayesian scoring functions

  1. BDeu
  2. K2 ## Notation ### $r_i : number of states of the finite random variable X_i$ ### $q_i : number of number of possible configurations of the parent set \prod_{X_i} of X_i $ ## =========================================================================================

1. BD(K2 scoring) -> BDe -> BDeu Score

BD: Bayesian Dirichlet

Assumptions

  • Assumption 1. Multinomial sample: Dirichlet distribution is Multinomial distribution's conjugate prior.
  • Assumption 2. Dirichlet: for all parameters from given DAG G, are follow Dirichlet distribution (repeat from 1)
  • Assumption 3. Parameter independence: for given DAG G where (P(G)>0):
    1. global parameter independence: P($\Theta_G$|G) = $\prod_i$P($\Theta_i$|G) (features to features)
    2. local parameter independence: P($\Theta_i$|G) = $\prod_j$P($\Theta_{ij}$|G) $\forall i$ (sample to sample)
  • Assumption 4. Parameter modularity: Given 2 DAG $G_1$ and $G_2$, if $X_i$ has same parents in $G_1$ and $G_2$ Then P($\Theta_{ij}$|$G_1$) = P($\Theta_{ij}$|$G_2$) (paramters values wont change due to different graph setup <- repeat in Assumption 3 Parameter independence) ### BD score func (log ease for computation) image info
  • $\eta_{ijk}$ are the hyperparameters for the Dirichlet prior distributions of the parameters given the network structure
  • Computationally impossible: Need to compute all hyperparameters $\eta_{ijk}$ (pseudo_counts) ### K2 score func (particular case of BD)
  • from BD score: $\eta_{ij} = \sum^{r_i}_{k=1}\eta_{ijk}$
  • $\Gamma(.)$ is the function Gamm where $\Gamma(c)=\int_0^\infty \mathrm{e}^{-u}\mathrm{u}^{c-1}du$
  • if c is an integer: $\Gamma(c)=(c-1)!$
  • let all hyperparameters $\eta_{ijk} = 1$
  • log(p(G)): it is quite common to assume a uniform distribution ([1] Koller & Friedman, Probabilistic Graphical Models - Principles and Techniques, 2009 Section 18.3.6.1 (esp. page 804)) image info image info
In [5]:
class K2Score(BaseEstimator):
    def __init__(self, data, **kwargs):
        super(K2Score, self).__init__(data, **kwargs)
        
    def K2_score_single(self, variable, parents, is_print=False):
        var_states = self.state_names[variable]
        var_cardinality = len(var_states) # number of element in a (undefined set) axiomatic set theory
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])
        counts = np.asarray(state_counts)
        
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            display(state_counts.head(5))
            print('-'*100)
            
        log_gamma_counts = np.zeros_like(counts, dtype=np.float_)
        
        # Compute log(gamma(counts + 1))
        gammaln(counts + 1, out=log_gamma_counts)
        if is_print: 
            printmd("Let $\eta_{ijk} = 1$")
            printmd("$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ = ")
            print(f"log_gamma_counts = \n {log_gamma_counts}")
            print('-'*100)
        
        # Compute the log-gamma conditional sample size
        log_gamma_conds = np.sum(counts, axis=0, dtype=np.float_)
        gammaln(log_gamma_conds + var_cardinality, out=log_gamma_conds)
        if is_print: 
            printmd("$N_{ij} = \sum_k(N_{ijk})$")
            printmd("$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$")
            printmd("$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ = ")
            print(f"log_gamma_conds(-ve sign will be subtract in score computation) = \n {log_gamma_conds}")
            print('-'*100)
        
        # Compute the log P(G)
        log_PG = num_parents_states * lgamma(var_cardinality)
        if is_print: 
            print(f"num_parents_states = {num_parents_states}")
            print(f"var_cardinality = {var_cardinality}")
            printmd("$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $")
            print(f"log(P(G)) = num_parents_states * lgamma(var_cardinality) = {log_PG}") 
            print('-'*100)
        score = (
            np.sum(log_gamma_counts)
            - np.sum(log_gamma_conds)
            + num_parents_states * lgamma(var_cardinality)
        )
        if is_print: 
            print(f"np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:")
        return score
    
    def score(self, model,is_print=False):
        score = 0
        for node in model.nodes():
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {list(model.predecessors(node))}")
                printmd(f"score: {self.K2_score_single(node, model.predecessors(node),is_print=is_print)}")
                print(f"="*100)
            score += self.K2_score_single(node, model.predecessors(node))
        return score
In [6]:
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())

# making a BN
BN_1 = BayesianModel([['fare','age'], ['age','passengerclass'],['passengerclass','portembarked'], ['portembarked','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()

K2Score(simple_df).score(BN_1,is_print=True)
age portembarked fare numparentschildren passengerclass sex numsiblings survived
0 1 1 1 1 1 1 1 1
1 2 2 1 1 2 2 1 2
2 1 1 1 1 1 2 1 2
3 2 1 1 1 2 2 1 2
4 2 1 1 1 1 1 1 1

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

fare
1 869
2 17
3 3
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[5.01612388e+03]
 [3.35050735e+01]
 [1.79175947e+00]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5165.29408915]
----------------------------------------------------------------------------------------------------
num_parents_states = 1.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.693147180559945
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -113.18022720424977

====================================================================================================

node: age

parent: ['fare']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[311. 8. 0.] [510. 8. 3.] [ 48. 1. 0.]]

fare 1 2 3
age
1 311.0 8.0 0.0
2 510.0 8.0 3.0
3 48.0 1.0 0.0
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[1.47786470e+03 1.06046029e+01 0.00000000e+00]
 [2.67358578e+03 1.06046029e+01 1.79175947e+00]
 [1.40673924e+02 0.00000000e+00 0.00000000e+00]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5.02966202e+03 3.93398842e+01 4.78749174e+00]
----------------------------------------------------------------------------------------------------
num_parents_states = 3.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 2.079441541679835
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -756.5845864600848

====================================================================================================

node: passengerclass

parent: ['age']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[208 276 7] [ 44 140 30] [ 67 105 12]]

age 1 2 3
passengerclass
1 208 276 7
2 44 140 30
3 67 105 12
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[ 905.79602879 1278.96007984    8.52516136]
 [ 125.31727115  555.22029415   74.65823635]
 [ 217.73693411  386.91254912   19.9872145 ]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [1535.43751922 2754.80999431  152.40959258]
----------------------------------------------------------------------------------------------------
num_parents_states = 3.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 2.079441541679835
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -867.4638952091083

====================================================================================================

node: portembarked

parent: ['passengerclass']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[353 127 164] [ 66 85 17] [ 72 2 3]]

passengerclass 1 2 3
portembarked
1 353 127 164
2 66 85 17
3 72 2 3
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[1.72171563e+03 4.91553448e+02 6.75847474e+02]
 [2.13532241e+02 2.95766601e+02 3.35050735e+01]
 [2.38978390e+02 6.93147181e-01 1.79175947e+00]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [2567.87038496  948.6670996   789.52114121]
----------------------------------------------------------------------------------------------------
num_parents_states = 3.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 2.079441541679835
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -630.5954167217334

====================================================================================================

node: survived

parent: ['portembarked']

r_i = :2

q_i = :3.0

statecounts($N{ijk}$):[[427 75 47] [217 93 30]]

portembarked 1 2 3
survived
1 427 75 47
2 217 93 30
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[2163.19429935  251.89040221  136.80272264]
 [ 954.04699695  331.7178872    74.65823635]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [3531.82014722  701.43726381  264.9216498 ]
----------------------------------------------------------------------------------------------------
num_parents_states = 3.0
var_cardinality = 2

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.0
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -585.8685161373155

====================================================================================================
Out[6]:
-2953.6926417324917
In [7]:
# making a BN
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()

K2Score(simple_df).score(BN_2,is_print=True)

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

fare
1 869
2 17
3 3
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[5.01612388e+03]
 [3.35050735e+01]
 [1.79175947e+00]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5165.29408915]
----------------------------------------------------------------------------------------------------
num_parents_states = 1.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.693147180559945
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -113.18022720424977

====================================================================================================

node: passengerclass

parent: ['fare']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[491. 0. 0.] [194. 17. 3.] [184. 0. 0.]]

fare 1 2 3
passengerclass
1 491.0 0.0 0.0
2 194.0 17.0 3.0
3 184.0 0.0 0.0
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[2.55547140e+03 0.00000000e+00 0.00000000e+00]
 [8.31517780e+02 3.35050735e+01 1.79175947e+00]
 [7.79075039e+02 0.00000000e+00 0.00000000e+00]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5.02966202e+03 3.93398842e+01 4.78749174e+00]
----------------------------------------------------------------------------------------------------
num_parents_states = 3.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 2.079441541679835
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -870.348902704644

====================================================================================================

node: survived

parent: ['passengerclass', 'age', 'portembarked']

r_i = :2

q_i = :27.0

statecounts($N{ijk}$):[[130. 12. 9. 152. 29. 34. 4. 0. 2. 5. 4. 0. 35. 16.

1.  13.   6.   0.  27.   2.   0.  53.   6.   0.   8.   0.   1.]

[ 36. 16. 5. 30. 9. 22. 1. 0. 0. 18. 17. 0. 52. 35.

1.   4.   7.   0.  32.   6.   0.  41.   3.   2.   3.   0.   0.]]
passengerclass 1 2 3
age 1 2 3 1 ... 3 1 2 3
portembarked 1 2 3 1 2 3 1 2 3 1 ... 3 1 2 3 1 2 3 1 2 3
survived
1 130.0 12.0 9.0 152.0 29.0 34.0 4.0 0.0 2.0 5.0 ... 0.0 27.0 2.0 0.0 53.0 6.0 0.0 8.0 0.0 1.0
2 36.0 16.0 5.0 30.0 9.0 22.0 1.0 0.0 0.0 18.0 ... 0.0 32.0 6.0 0.0 41.0 3.0 2.0 3.0 0.0 0.0

2 rows × 27 columns

----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[506.13282534  19.9872145   12.80182748 615.06126621  71.25703897
   88.58082754   3.17805383   0.           0.69314718   4.78749174
    3.17805383   0.          92.1361756   30.67186011   0.
   22.55216385   6.57925121   0.          64.55753863   0.69314718
    0.         160.33112822   6.57925121   0.          10.6046029
    0.           0.        ]
 [ 95.71969454  30.67186011   4.78749174  74.65823635  12.80182748
   48.47118135   0.           0.           0.          36.39544521
   33.50507345   0.         156.3608363   92.1361756    0.
    3.17805383   8.52516136   0.          81.55795946   6.57925121
    0.         114.03421178   1.79175947   0.69314718   1.79175947
    0.           0.        ]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [6.91183401e+02 7.12570390e+01 2.78992714e+01 7.73860103e+02
 1.06631760e+02 1.76395848e+02 6.57925121e+00 0.00000000e+00
 1.79175947e+00 5.47847294e+01 4.84711814e+01 0.00000000e+00
 3.09164194e+02 1.56360836e+02 1.79175947e+00 3.63954452e+01
 2.51912212e+01 0.00000000e+00 1.88628173e+02 1.28018275e+01
 0.00000000e+00 3.40815059e+02 1.51044126e+01 1.79175947e+00
 1.99872145e+01 0.00000000e+00 6.93147181e-01]
----------------------------------------------------------------------------------------------------
num_parents_states = 27.0
var_cardinality = 2

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.0
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -543.5574023242825

====================================================================================================

node: age

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[319] [521] [ 49]]

age
1 319
2 521
3 49
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[1523.89775711]
 [2742.29274526]
 [ 144.56574395]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5165.29408915]
----------------------------------------------------------------------------------------------------
num_parents_states = 1.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.693147180559945
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -753.8446956621158

====================================================================================================

node: portembarked

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[644] [168] [ 77]]

portembarked
1 644
2 168
3 77
----------------------------------------------------------------------------------------------------

Let $\eta_{ijk} = 1$

$log(\Gamma(N_{ijk} + \eta_{ijk}) / \Gamma(\eta_{ijk}))$ = $log(\Gamma(N_{ijk} + 1) / 1)$ =

log_gamma_counts = 
 [[3525.35089691]
 [ 696.30736509]
 [ 260.56494097]]
----------------------------------------------------------------------------------------------------

$N_{ij} = \sum_k(N_{ijk})$

$\eta_{ij} = \sum_k(\eta_{ijk}) = k * 1$

$log(\eta_{ij}) / \Gamma(N_{ij} + \eta_{ij}))$ = $ -log(\Gamma(N_{ij} + k))$ =

log_gamma_conds(-ve sign will be subtract in score computation) = 
 [5165.29408915]
----------------------------------------------------------------------------------------------------
num_parents_states = 1.0
var_cardinality = 3

$P(G) = no\_of\_paremts * log(\Gamma(no\_of\_states)) $

log(P(G)) = num_parents_states * lgamma(var_cardinality) = 0.693147180559945
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + log_PG:

score: -682.3777389980253

====================================================================================================
Out[7]:
-2963.308966893317
In [8]:
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()

K2Score(GME_stock_df).score(BN_GMS,is_print=False)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
Out[8]:
-45277.152952688666

BDe: Bayesian Dirichlet (likelihood-equivalence ) -> BDeu

  • from BD: image info
    • Problem:Computationally impossible: Need to compute all hyperparameters $\eta_{ijk}$ (pseudo_counts)

Assumptions (aside of BD Assumptions 1~4)

  • Assumption 5. Likelihood equivalence
    • Definition 1. Equivalent directed acyclic graphs:
      • Two directed acyclic graphs are equivalent if they can encode the same joint probability distributions.
    • Assumption:
      • Given 2 DAG $G_1$ and $G_2$, such that P($G_1$)>0 and P($G_2$)>0, if $G_1$ and $G_2$ are equivalence then P($\Theta_D$|$G_1$) = P($\Theta_D$|$G_2$)
  • Assumption 6. Structure possibility
    • The skeleton of any DAG is the undirected graph resulting from ignoring the directionality of every edge.
    • Complete directed acyclic graph = skeleton is complete (regardless of the direction)
    • Assumption:
      • For any complete directed acyclic graph G, we have that P(G) > 0

BDe computation

image info

  • $\eta_{ijk}$ can be solved as
    • $\eta_{ijk} = N' \times p(x_{ik},w_{ij}|G_0)$
  • where $p(.|G0)$ represents a probability distribution associated with a prior Bayesian network G0
  • $N'$ is a parameter representing the equivalent sample size.(expresses the strength of our belief in the prior distribution)
  • Computationally impossible: (pseudo_counts)
    • Better than BD: (may need to discuss)
      • BD: $\eta_{ijk}$
      • BDe: $x_{ik},w_{ij}$
    • Still Need to know $p(x_{ik},w_{ij}|G_0)$ for all i,j,k

BDeu score

  • $p(x_{ik},w_{ij}|G_0) = \frac{1}{r_{i}q_{i}}$
  • the prior network assigns a uniform probability to each configuration
    • $p(x_{ik},w_{ij}|G_0)$ = $\frac{1}{r_{i}q_{i} - 0}$ $for$ $x \in [0, r_{i}q_{i}]$ else $0$ image info
  • This score only depends on one parameter, the equivalent sample size N'!
  • Since there are no generally accepted rule to determine the hyperparameters $N'_{x1...xn}$ , there is no particular good candidate for N'
  • BDeu score is very sensitive with respect to the equivalent sample size N', so need to try with many different setings

Implementation

let $\alpha_i = \frac{N'}{q_{i}}$

let $\beta_i = \frac{N'}{r_{i}q_{i}}$

$g_{BDeu}(G:D) = log(p(G))+\sum^n_{i=1}[\sum^{q_i}_{j=1}[log(\frac{\Gamma(\alpha_i)}{\Gamma(N_{ij}+\alpha_i)})+\sum^{r_i}_{k=1}log(\frac{\Gamma(N_{ijk}+\beta_i)}{\Gamma(\beta_i)})]]$

$g_{BDeu}(G:D) = log(p(G))+\sum^n_{i=1}[\sum^{q_i}_{j=1}[log(\Gamma(\alpha_i)) - log(\Gamma(N_{ij}+\alpha_i)) + \sum^{r_i}_{k=1}log(\Gamma(N_{ijk}+\beta_i)) - log(\Gamma(\beta_i))]]$

$g_{BDeu}(G:D) = log(p(G))+\sum^n_{i=1}[\sum^{q_i}_{j=1}[- log(\Gamma(N_{ij}+\alpha_i)) + \sum^{r_i}_{k=1}log(\Gamma(N_{ijk}+\beta_i)) )] + {q_i} \times log(\Gamma(\alpha_i)) - {r_i}{q_i} \times log(\Gamma(\beta_i)]$

In [9]:
class BDeuScore(BaseEstimator):
    def __init__(self, data, equivalent_sample_size=10, **kwargs):
        self.equivalent_sample_size = equivalent_sample_size
        super(BDeuScore, self).__init__(data, **kwargs)
        
    def BDeu_score_single(self, variable, parents, is_print=False):
        var_states = self.state_names[variable]
        var_cardinality = len(var_states) # number of element in a (undefined set) axiomatic set theory
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])
        counts = np.asarray(state_counts)
        
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            display(state_counts.head(2))
            print('-'*100)
            
        log_gamma_counts = np.zeros_like(counts, dtype=np.float_)
        
        
        alpha = self.equivalent_sample_size / num_parents_states
        beta = self.equivalent_sample_size / counts.size
        if is_print:
            printmd("##### Let $alpha = {N'}/q_{i}$ = " + str(alpha))
            printmd("##### Let $ beta = {N'}/(r_{i}q_{i})$ = " + str(beta))
            print('-'*100)
        
        
        # Compute log(gamma(counts + beta))
        gammaln(counts + beta, out=log_gamma_counts)
        if is_print: 
            printmd("$log(\Gamma(N_{ijk} + beta))$ = ")
            print(f"log_gamma_counts = \n {log_gamma_counts}")
            print('-'*100)
        
        # Compute the log-gamma conditional sample size
        log_gamma_conds = np.sum(counts, axis=0, dtype=np.float_)
        gammaln(log_gamma_conds + alpha, out=log_gamma_conds)
        if is_print: 
            printmd("$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$")
            print(f"log_gamma_conds = \n {log_gamma_conds}")
            print('-'*100)
        
        
        score = (
            np.sum(log_gamma_counts)
            - np.sum(log_gamma_conds)
            + num_parents_states * lgamma(alpha)
            - counts.size * lgamma(beta)
        )
        if is_print: 
            print(f"np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):")
        return score
    
    def score(self, model,is_print=False):
        score = 0
        for node in model.nodes():
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {list(model.predecessors(node))}")
                printmd(f"score: {self.BDeu_score_single(node, model.predecessors(node),is_print=is_print)}")
                print(f"="*100)
            score += self.BDeu_score_single(node, model.predecessors(node))
        return score
In [10]:
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())

# making a BN
BN_1 = BayesianModel([['fare','age'], ['age','passengerclass'],['passengerclass','portembarked'], ['portembarked','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()

BDeuScore(simple_df, equivalent_sample_size=10).score(BN_1,is_print=True)
age portembarked fare numparentschildren passengerclass sex numsiblings survived
0 1 1 1 1 1 1 1 1
1 2 2 1 1 2 2 1 2
2 1 1 1 1 1 2 1 2
3 2 1 1 1 2 2 1 2
4 2 1 1 1 1 1 1 1

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

fare
1 869
2 17
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 10.0
Let $ beta = {N'}/(r_{i}q_{i})$ = 3.3333333333333335
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[5031.91881941]
 [  40.33289114]
 [   5.3660746 ]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [5212.8718377]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -125.51758993796017

====================================================================================================

node: age

parent: ['fare']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[311. 8. 0.] [510. 8. 3.] [ 48. 1. 0.]]

fare 1 2 3
age
1 311.0 8.0 0.0
2 510.0 8.0 3.0
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[ 1.47850265e+03  1.08431742e+01 -5.44927770e-02]
 [ 2.67427861e+03  1.08431742e+01  1.93306207e+00]
 [ 1.41105338e+02  5.08677387e-02 -5.44927770e-02]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [5031.91881941   40.33289114    5.3660746 ]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -756.6140925123143

====================================================================================================

node: passengerclass

parent: ['age']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[208 276 7] [ 44 140 30] [ 67 105 12]]

age 1 2 3
passengerclass
1 208 276 7
2 44 140 30
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[ 906.38938506 1279.58479231    8.74993934]
 [ 125.7391331   555.7698058    75.03819086]
 [ 218.20504071  387.43024258   20.26837272]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [1537.36202462 2756.89694614  153.72453482]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -867.2528032268308

====================================================================================================

node: portembarked

parent: ['passengerclass']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[353 127 164] [ 66 85 17] [ 72 2 3]]

passengerclass 1 2 3
portembarked
1 353 127 164
2 66 85 17
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[1.72236764e+03 4.92092177e+02 6.76414502e+02]
 [2.13998691e+02 2.96260954e+02 3.38234628e+01]
 [2.39454430e+02 7.98082141e-01 1.93306207e+00]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [2569.93767185  950.45988655  791.26424973]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -630.9630112672578

====================================================================================================

node: survived

parent: ['portembarked']

r_i = :2

q_i = :3.0

statecounts($N{ijk}$):[[427 75 47] [217 93 30]]

portembarked 1 2 3
survived
1 427 75 47
2 217 93 30
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.6666666666666667
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[2167.23345524  254.77609686  139.38121121]
 [ 957.63615077  334.74556906   76.94398348]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [3540.44822374  708.28630012  270.75038335]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -585.0891867143772

====================================================================================================
Out[10]:
-2965.4366836587405
In [11]:
# making a BN
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()

BDeuScore(simple_df, equivalent_sample_size=10).score(BN_1,is_print=True)

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

fare
1 869
2 17
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 10.0
Let $ beta = {N'}/(r_{i}q_{i})$ = 3.3333333333333335
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[5031.91881941]
 [  40.33289114]
 [   5.3660746 ]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [5212.8718377]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -125.51758993796017

====================================================================================================

node: age

parent: ['fare']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[311. 8. 0.] [510. 8. 3.] [ 48. 1. 0.]]

fare 1 2 3
age
1 311.0 8.0 0.0
2 510.0 8.0 3.0
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[ 1.47850265e+03  1.08431742e+01 -5.44927770e-02]
 [ 2.67427861e+03  1.08431742e+01  1.93306207e+00]
 [ 1.41105338e+02  5.08677387e-02 -5.44927770e-02]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [5031.91881941   40.33289114    5.3660746 ]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -756.6140925123143

====================================================================================================

node: passengerclass

parent: ['age']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[208 276 7] [ 44 140 30] [ 67 105 12]]

age 1 2 3
passengerclass
1 208 276 7
2 44 140 30
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[ 906.38938506 1279.58479231    8.74993934]
 [ 125.7391331   555.7698058    75.03819086]
 [ 218.20504071  387.43024258   20.26837272]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [1537.36202462 2756.89694614  153.72453482]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -867.2528032268308

====================================================================================================

node: portembarked

parent: ['passengerclass']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[353 127 164] [ 66 85 17] [ 72 2 3]]

passengerclass 1 2 3
portembarked
1 353 127 164
2 66 85 17
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.1111111111111112
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[1.72236764e+03 4.92092177e+02 6.76414502e+02]
 [2.13998691e+02 2.96260954e+02 3.38234628e+01]
 [2.39454430e+02 7.98082141e-01 1.93306207e+00]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [2569.93767185  950.45988655  791.26424973]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -630.9630112672578

====================================================================================================

node: survived

parent: ['portembarked']

r_i = :2

q_i = :3.0

statecounts($N{ijk}$):[[427 75 47] [217 93 30]]

portembarked 1 2 3
survived
1 427 75 47
2 217 93 30
----------------------------------------------------------------------------------------------------
Let $alpha = {N'}/q_{i}$ = 3.3333333333333335
Let $ beta = {N'}/(r_{i}q_{i})$ = 1.6666666666666667
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ijk} + beta))$ =

log_gamma_counts = 
 [[2167.23345524  254.77609686  139.38121121]
 [ 957.63615077  334.74556906   76.94398348]]
----------------------------------------------------------------------------------------------------

$log(\Gamma(N_{ij} + alpha))$ = $ log(\Gamma(N_{ij} + k))$

log_gamma_conds = 
 [3540.44822374  708.28630012  270.75038335]
----------------------------------------------------------------------------------------------------
np.sum(log_gamma_counts) - np.sum(log_gamma_conds) + num_parents_states * lgamma(alpha) - counts.size * lgamma(beta):

score: -585.0891867143772

====================================================================================================
Out[11]:
-2965.4366836587405
In [12]:
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()

BDeuScore(GME_stock_df, equivalent_sample_size=20).score(BN_GMS,is_print=False)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
Out[12]:
-56656.75504864646

Information-theoretic scoring functions

2. Shannon source coding theorem. (ref: https://en.wikipedia.org/wiki/Shannon%27s_source_coding_theorem#:~:text=In%20information%20theory%2C%20Shannon's%20source,meaning%20of%20the%20Shannon%20entropy.&text=However%20it%20is%20possible%20to,with%20negligible%20probability%20of%20loss.)

image info

3. Information content of D by B:

a. The size of an optimal code, induced by the distribution B, when encoding D

b. This value can be used to score the BN B. (the best BN is can be compressed close to shannon limit yet not lose info)

image info

c. Gibb’s inequality

image info

  • Given fixed DAG struct of BN B, L(D|B)min achieved when: $\theta{ijk}=\frac{N{ijk}}{N{ij}}$
  • L(T|B) is minimal when Likeihood P(D|B) is maximal
  • parameters of B induces a code compresses D the most is precisely the parameters that maximizes the probability of observing D (maximum likelihood).

4. log-likelihood (LL) score function:(B->G)

image info

a. This phenomenon of overfitting is usually avoided in two different ways:

* limiting the number of parents per network variable.
* penalization factor over LL score:
    * **MDL/BIC**
    * **AIC**
    * **NML**

4. log-likelihood (LL) score function with penalization:

image info

where $C(G)=\sum^n_{i=1}(r_i-1)q_i$ (network complexity)

idea:

ref: Journal of Machine Learning Research - SCORING BAYESIAN NETWORKS USING MUTUAL INFORMATION AND INDEPENDENCE TESTS (2154~2155)

  • the best model is the one that minimizes the sum of the description length of the model and the description length of the data given the model
  • For simple models require shorter description lengths, but the description length of the data given the model increases
  • The minimum description length principle establishes an appropriate trade-off between complexity and precision
  • for BN: the description length includes the length required to represent the network plus the length necessary to represent the data given the network
  • we must store its probability values, and this requires a length which is proportional to the number of free parameters of the factorized joint probability distribution and This number, called network complexity and denoted as C(G) ## =========================================================================================

1. MDL/BIC

BIC: Bayesian Information Criterion / MDL: Minimal Descriptive Length

$f(N) = \frac{1}{2}log(N)$

image info

In [13]:
class BicScore(BaseEstimator):
    def __init__(self, data, **kwargs):
        super(BicScore, self).__init__(data, **kwargs)
        
    def Bic_score_single(self, variable, parents, is_print=False):
        var_states = self.state_names[variable]
        var_cardinality = len(var_states) # number of element in a (undefined set) axiomatic set theory
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])
        counts = np.asarray(state_counts)
        sample_size = len(self.data)
        
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
            
        log_likelihoods = np.zeros_like(counts, dtype=np.float_)
        
        # Compute the log-counts
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print: 
            printmd("$log(N_{ijk})(>0)$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        # Compute the log-conditional sample size
        log_conditionals = np.sum(counts, axis=0, dtype=np.float_)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print: 
            printmd("$log(N_{ij})(>0)$")
            print(f"{log_conditionals}")
            print('-'*100)
        
        # Compute the log-likelihoods
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print: 
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        score = np.sum(log_likelihoods)
        score -= 0.5 * log(sample_size) * num_parents_states * (var_cardinality - 1)
        if is_print: 
            printmd("bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        
        return score
    
    def score(self, model,is_print=False):
        score = 0
        for node in model.nodes():
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {list(model.predecessors(node))}")
                printmd(f"score: {self.Bic_score_single(node, model.predecessors(node),is_print=is_print)}")
                print(f"="*100)
            score += self.Bic_score_single(node, model.predecessors(node))
        return score
In [14]:
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())

# making a BN
BN_1 = BayesianModel([['fare','passengerclass'], ['age','passengerclass'],['portembarked','passengerclass'], ['passengerclass','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()

BicScore(simple_df).score(BN_1,is_print=True)
age portembarked fare numparentschildren passengerclass sex numsiblings survived
0 1 1 1 1 1 1 1 1
1 2 2 1 1 2 2 1 2
2 1 1 1 1 1 2 1 2
3 2 1 1 1 2 2 1 2
4 2 1 1 1 1 1 1 1

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

sample_size($N$):889

fare
1 869
2 17
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[6.76734313]
 [2.83321334]
 [1.09861229]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-19.77332181]
 [-67.26702615]
 [-17.07445484]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-19.77332181]
 [-67.26702615]
 [-17.07445484]]
----------------------------------------------------------------------------------------------------

score: -110.90490003678934

====================================================================================================

node: passengerclass

parent: ['fare', 'age', 'portembarked']

r_i = :3

q_i = :27.0

statecounts($N{ijk}$):[[166. 28. 14. 182. 38. 56. 5. 0. 2. 0. 0. 0. 0. 0.

0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]

[ 19. 17. 0. 84. 43. 2. 16. 13. 0. 4. 4. 0. 3. 5.

0.   1.   0.   0.   0.   0.   0.   0.   3.   0.   0.   0.   0.]

[ 59. 8. 0. 94. 9. 2. 11. 0. 1. 0. 0. 0. 0. 0.

0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]]

sample_size($N$):889

fare 1 2 3
age 1 2 3 1 ... 3 1 2 3
portembarked 1 2 3 1 2 3 1 2 3 1 ... 3 1 2 3 1 2 3 1 2 3
passengerclass
1 166.0 28.0 14.0 182.0 38.0 56.0 5.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 19.0 17.0 0.0 84.0 43.0 2.0 16.0 13.0 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0

2 rows × 27 columns

----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[5.11198779 3.33220451 2.63905733 5.20400669 3.63758616 4.02535169
  1.60943791 0.         0.69314718 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [2.94443898 2.83321334 0.         4.4308168  3.76120012 0.69314718
  2.77258872 2.56494936 0.         1.38629436 1.38629436 0.
  1.09861229 1.60943791 0.         0.         0.         0.
  0.         0.         0.         0.         1.09861229 0.
  0.         0.         0.        ]
 [4.07753744 2.07944154 0.         4.54329478 2.19722458 0.69314718
  2.39789527 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[5.49716823 3.97029191 2.63905733 5.88610403 4.49980967 4.09434456
 3.4657359  2.56494936 1.09861229 1.38629436 1.38629436 0.
 1.09861229 1.60943791 0.         0.         0.         0.
 0.         0.         0.         0.         1.09861229 0.
 0.         0.         0.        ]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[ -63.93995253  -17.86644729    0.         -124.14171668  -32.7644934
    -3.8636008    -9.28148995   -0.           -0.81093022   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]
 [ -48.50185568  -19.33033568   -0.         -122.24412754  -31.76021085
    -6.80239476  -11.09035489    0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.        ]
 [ -83.7582161   -15.12680297   -0.         -126.22406942  -20.72326584
    -6.80239476  -11.74624693   -0.           -1.09861229   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[ -63.93995253  -17.86644729    0.         -124.14171668  -32.7644934
    -3.8636008    -9.28148995   -0.           -0.81093022   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]
 [ -48.50185568  -19.33033568   -0.         -122.24412754  -31.76021085
    -6.80239476  -11.09035489    0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.        ]
 [ -83.7582161   -15.12680297   -0.         -126.22406942  -20.72326584
    -6.80239476  -11.74624693   -0.           -1.09861229   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]]
----------------------------------------------------------------------------------------------------

score: -941.2101439523303

====================================================================================================

node: age

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[319] [521] [ 49]]

sample_size($N$):889

age
1 319
2 521
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[5.7651911 ]
 [6.25575004]
 [3.8918203 ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-326.94505634]
 [-278.39488795]
 [-142.01556993]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-326.94505634]
 [-278.39488795]
 [-142.01556993]]
----------------------------------------------------------------------------------------------------

score: -754.1456114580849

====================================================================================================

node: portembarked

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[644] [168] [ 77]]

sample_size($N$):889

portembarked
1 644
2 168
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[6.46769873]
 [5.12396398]
 [4.34380542]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-207.62464006]
 [-279.91038703]
 [-188.36446965]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-207.62464006]
 [-279.91038703]
 [-188.36446965]]
----------------------------------------------------------------------------------------------------

score: -682.6895939736901

====================================================================================================

node: survived

parent: ['passengerclass']

r_i = :2

q_i = :3.0

statecounts($N{ijk}$):[[372 80 97] [119 134 87]]

sample_size($N$):889

passengerclass 1 2 3
survived
1 372 80 97
2 119 134 87
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[5.91889385 4.38202663 4.57471098]
 [4.77912349 4.8978398  4.46590812]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.19644413 5.36597602 5.21493576]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-103.24870175  -78.71595043  -62.10180357]
 [-168.66115553  -62.73025282  -65.16540459]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-103.24870175  -78.71595043  -62.10180357]
 [-168.66115553  -62.73025282  -65.16540459]]
----------------------------------------------------------------------------------------------------

score: -550.8084145401181

====================================================================================================
Out[14]:
-3039.7586639610126
In [15]:
# making a BN
BN_2 = BayesianModel([['fare','passengerclass'], ['passengerclass','survived'],['age','survived'], ['portembarked','survived']])
nx.draw(BN_2, with_labels=True,node_size=3e3,width=3)
plt.show()

BicScore(simple_df).score(BN_2,is_print=True)

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[869] [ 17] [ 3]]

sample_size($N$):889

fare
1 869
2 17
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[6.76734313]
 [2.83321334]
 [1.09861229]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-19.77332181]
 [-67.26702615]
 [-17.07445484]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-19.77332181]
 [-67.26702615]
 [-17.07445484]]
----------------------------------------------------------------------------------------------------

score: -110.90490003678934

====================================================================================================

node: passengerclass

parent: ['fare']

r_i = :3

q_i = :3.0

statecounts($N{ijk}$):[[491. 0. 0.] [194. 17. 3.] [184. 0. 0.]]

sample_size($N$):889

fare 1 2 3
passengerclass
1 491.0 0.0 0.0
2 194.0 17.0 3.0
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[6.19644413 0.         0.        ]
 [5.26785816 2.83321334 1.09861229]
 [5.21493576 0.         0.        ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.76734313 2.83321334 1.09861229]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-280.31140776   -0.           -0.        ]
 [-290.90008344    0.            0.        ]
 [-285.64295565   -0.           -0.        ]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-280.31140776   -0.           -0.        ]
 [-290.90008344    0.            0.        ]
 [-285.64295565   -0.           -0.        ]]
----------------------------------------------------------------------------------------------------

score: -877.224738556719

====================================================================================================

node: survived

parent: ['passengerclass', 'age', 'portembarked']

r_i = :2

q_i = :27.0

statecounts($N{ijk}$):[[130. 12. 9. 152. 29. 34. 4. 0. 2. 5. 4. 0. 35. 16.

1.  13.   6.   0.  27.   2.   0.  53.   6.   0.   8.   0.   1.]

[ 36. 16. 5. 30. 9. 22. 1. 0. 0. 18. 17. 0. 52. 35.

1.   4.   7.   0.  32.   6.   0.  41.   3.   2.   3.   0.   0.]]

sample_size($N$):889

passengerclass 1 2 3
age 1 2 3 1 ... 3 1 2 3
portembarked 1 2 3 1 2 3 1 2 3 1 ... 3 1 2 3 1 2 3 1 2 3
survived
1 130.0 12.0 9.0 152.0 29.0 34.0 4.0 0.0 2.0 5.0 ... 0.0 27.0 2.0 0.0 53.0 6.0 0.0 8.0 0.0 1.0
2 36.0 16.0 5.0 30.0 9.0 22.0 1.0 0.0 0.0 18.0 ... 0.0 32.0 6.0 0.0 41.0 3.0 2.0 3.0 0.0 0.0

2 rows × 27 columns

----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[4.86753445 2.48490665 2.19722458 5.02388052 3.36729583 3.52636052
  1.38629436 0.         0.69314718 1.60943791 1.38629436 0.
  3.55534806 2.77258872 0.         2.56494936 1.79175947 0.
  3.29583687 0.69314718 0.         3.97029191 1.79175947 0.
  2.07944154 0.         0.        ]
 [3.58351894 2.77258872 1.60943791 3.40119738 2.19722458 3.09104245
  0.         0.         0.         2.89037176 2.83321334 0.
  3.95124372 3.55534806 0.         1.38629436 1.94591015 0.
  3.4657359  1.79175947 0.         3.71357207 1.09861229 0.69314718
  1.09861229 0.         0.        ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[5.11198779 3.33220451 2.63905733 5.20400669 3.63758616 4.02535169
 1.60943791 0.         0.69314718 3.13549422 3.04452244 0.
 4.46590812 3.93182563 0.69314718 2.83321334 2.56494936 0.
 4.07753744 2.07944154 0.         4.54329478 2.19722458 0.69314718
 2.39789527 0.         0.        ]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-31.77893393 -10.16757432  -3.97649477 -27.37917727  -7.83841956
  -16.96569965  -0.89257421   0.           0.          -7.63028152
   -6.63291231   0.         -31.869602   -18.54779057  -0.69314718
   -3.48743183  -4.63913933   0.         -21.1059156   -2.77258872
    0.         -30.36915204  -2.43279065  -0.          -2.54762985
    0.           0.        ]
 [-55.0248786   -8.95385261  -5.14809709 -54.08427916 -12.96325424
  -20.55480322  -1.60943791   0.          -0.          -4.41220424
   -3.59225459   0.         -26.7625488  -13.17671499  -0.69314718
   -5.78767593  -4.33327446   0.         -19.57764932  -1.72609243
    0.         -34.01863134  -3.29583687   0.          -3.89784895
    0.           0.        ]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-31.77893393 -10.16757432  -3.97649477 -27.37917727  -7.83841956
  -16.96569965  -0.89257421   0.           0.          -7.63028152
   -6.63291231   0.         -31.869602   -18.54779057  -0.69314718
   -3.48743183  -4.63913933   0.         -21.1059156   -2.77258872
    0.         -30.36915204  -2.43279065  -0.          -2.54762985
    0.           0.        ]
 [-55.0248786   -8.95385261  -5.14809709 -54.08427916 -12.96325424
  -20.55480322  -1.60943791   0.          -0.          -4.41220424
   -3.59225459   0.         -26.7625488  -13.17671499  -0.69314718
   -5.78767593  -4.33327446   0.         -19.57764932  -1.72609243
    0.         -34.01863134  -3.29583687   0.          -3.89784895
    0.           0.        ]]
----------------------------------------------------------------------------------------------------

score: -603.0060499176219

====================================================================================================

node: age

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[319] [521] [ 49]]

sample_size($N$):889

age
1 319
2 521
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[5.7651911 ]
 [6.25575004]
 [3.8918203 ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-326.94505634]
 [-278.39488795]
 [-142.01556993]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-326.94505634]
 [-278.39488795]
 [-142.01556993]]
----------------------------------------------------------------------------------------------------

score: -754.1456114580849

====================================================================================================

node: portembarked

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):[[644] [168] [ 77]]

sample_size($N$):889

portembarked
1 644
2 168
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =

[[6.46769873]
 [5.12396398]
 [4.34380542]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$

[6.79009724]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-207.62464006]
 [-279.91038703]
 [-188.36446965]]
----------------------------------------------------------------------------------------------------

bic score = $\sum log\_likelihoods - 0.5*log(sample\_size(N)) * num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$

[[-207.62464006]
 [-279.91038703]
 [-188.36446965]]
----------------------------------------------------------------------------------------------------

score: -682.6895939736901

====================================================================================================
Out[15]:
-3027.970893942905
In [16]:
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()

BicScore(GME_stock_df).score(BN_GMS,is_print=False)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
Out[16]:
-34872223.72144956

2. AIC

AIC: Akaike Information Criterion

$f(N) = 1$

image info

In [17]:
class AicScore(BaseEstimator):
    def __init__(self, data, **kwargs):
        super(AicScore, self).__init__(data, **kwargs)
        
    def Aic_score_single(self, variable, parents, is_print=False):
        var_states = self.state_names[variable]
        var_cardinality = len(var_states) # number of element in a (undefined set) axiomatic set theory
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])
        counts = np.asarray(state_counts)
        sample_size = len(self.data)
        
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
            
        log_likelihoods = np.zeros_like(counts, dtype=np.float_)
        
        # Compute the log-counts
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print: 
            printmd("$log(N_{ijk})(>0)$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        # Compute the log-conditional sample size
        log_conditionals = np.sum(counts, axis=0, dtype=np.float_)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print: 
            printmd("$log(N_{ij})(>0)$")
            print(f"{log_conditionals}")
            print('-'*100)
        
        # Compute the log-likelihoods
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print: 
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        score = np.sum(log_likelihoods)
        score -= num_parents_states * (var_cardinality - 1)
        if is_print: 
            printmd("bic score = $\sum log\_likelihoods -  num_parents\_states(q_i)*(var\_cardinality(r_i) - 1)$")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        
        return score
    
    def score(self, model,is_print=False):
        score = 0
        for node in model.nodes():
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {list(model.predecessors(node))}")
                printmd(f"score: {self.Aic_score_single(node, model.predecessors(node),is_print=is_print)}")
                print(f"="*100)
            score += self.Aic_score_single(node, model.predecessors(node))
        return score
In [18]:
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()

AicScore(GME_stock_df).score(BN_GMS,is_print=False)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
Out[18]:
-8916378.834636314

2. NML scoring function (not found any implementation online, this implementation in python may be the first)

NML: normalized minimum likelihood

log-likelihood (LL) score function with penalization (General form):

image info image info

Idea: What if we treat $C(G)f(N)$ as a distribution H?

  • Given a set of probability distributions H the encoder thinks that there is one distribution H0 ∈ H that will assign high likelihood (low code length) to the incoming data D of fixed size N. $(L(H)_{min} = L(H0))$
  • We woukd like to design a code that for all D will compress D as close as possible to the code associated to H0 ∈ H that maximizes the likelihood of D $LL_D(G)_{max} | H0$
  • We call to this H0 ∈ H the best-fitting hypothesis. ## Too much explaination (ignore) ref: http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=72FFB108DFC6009541FF8E1A83E88308?doi=10.1.1.140.1665&rep=rep1&type=pdf -> implementation ## Final form of NML function: ## ${NML}(D|G) = LL_D(G) - C_N(B_G)$ where $B_G$ denote the set of all BNs with network structure G
  • There is no hope for computing $C_N(B_G)$ efficiently:
    • It involves an exponential sum over all possible data of size N.
    • It is not decomposable over the network structure. #### But from ref: proposed to approximate $C_N(B_G)$ by considering only the contribution to the parametric complexity of the multinomial distributions associated to each variable given a parent configuration: image info image info image info
In [19]:
class fNMLScore(BaseEstimator):
    def __init__(self, data, **kwargs):
        super(fNMLScore, self).__init__(data, **kwargs)
    
    def Szpankowski_approximation(self, row):
        if row.n == 0: return 1
        return 0.5*row.n*np.pi*np.exp(np.sqrt(8/(9*row.n*np.pi))+((3*np.pi-16)/(36*row.n*np.pi)))
    
    def Compute_regret(self, row, r):
        if row.n == 0: return 1
        return  row[f'C_{r-1}'] + (row.n/(r-2))*row[f'C_{r-2}']
        
    def fNML_score_single(self, variable, parents, is_print=False):
        var_states = self.state_names[variable]
        var_cardinality = len(var_states) # number of element in a (undefined set) axiomatic set theory
        state_counts = self.state_counts(variable, parents)
        num_parents_states = float(state_counts.shape[1])
        counts = np.asarray(state_counts)
        sample_size = len(self.data)
        
        if is_print:
            printmd(f'r_i = :{var_cardinality}')
            printmd(f'q_i = :{num_parents_states}')
            printmd('state_counts($N_{ijk}$):<br/>'+str(counts))
            printmd('sample_size($N$):'+str(sample_size))
            display(state_counts.head(2))
            print('-'*100)
            
        log_likelihoods = np.zeros_like(counts, dtype=np.float_)
        
        # Compute the log-counts
        np.log(counts, out=log_likelihoods, where=counts > 0)
        if is_print: 
            printmd("$log(N_{ijk})(>0)$ = <br/>"+f"{log_likelihoods}")
            print('-'*100)
        
        # Compute the log-conditional sample size
        log_conditionals = np.sum(counts, axis=0, dtype=np.float_)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        if is_print: 
            printmd("$log(N_{ij})(>0)$<br/>"+f"{log_conditionals}")
            print('-'*100)
        
        # Compute the log-likelihoods
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts
        if is_print: 
            printmd("log_likelihoods = $N_{ijk} * log(N_{ijk}/N_{ij})$ = $N_{ijk}*[log(N_{ijk}) - log(N_{ij})]$ = ")
            print(f"{log_likelihoods}")
            print('-'*100)
        
        # Compute the 𝐶𝑁(𝐵𝐺) table
        N_ij = np.sum(counts, axis=0, dtype=np.float_)
        
        N_ij_unique, N_ij_cnt = np.unique(N_ij, return_counts=True)
        N_ij_dist = dict(zip(N_ij_unique, N_ij_cnt))
        
        C_ini = [[n, 1] for n in N_ij_unique]
        C_lookup_table = pd.DataFrame(C_ini, columns = ['n', 'C_1'])
        
        if var_cardinality > 1:
            C_lookup_table['C_2'] = C_lookup_table.apply(self.Szpankowski_approximation,axis=1)
        if var_cardinality > 2:
            for r_i in range(3,var_cardinality+1):
                Compute_regret_par = partial(self.Compute_regret, r=r_i)
                C_lookup_table[f'C_{r_i}'] = C_lookup_table.apply(Compute_regret_par,axis=1)
                if r_i > 5: #only obtain the last 3 regret C for memory saving
                    C_lookup_table = C_lookup_table.drop(columns=[f'C_{r_i-3}'])
        #C_lookup_table['N_ij_cnt'] = N_ij_cnt
        C_lookup_table['fNML_regret'] = np.log(C_lookup_table[f'C_{var_cardinality}'])
        
        fNML_regret = C_lookup_table.fNML_regret.values
        
        if is_print: 
            print(f"N_ij distribution:\n {N_ij_dist}")
            display(C_lookup_table)
            print("fNML_regret = \n"+f"{fNML_regret}")
            print('-'*100)
        
        score = np.sum(log_likelihoods)
        score -= np.sum(fNML_regret)
        if is_print: 
            printmd("fNML score = $\sum log\_likelihoods -  \sum logC^{r_i}_{N_{ij}}$")
            print('-'*100)
        return score
    
    def score(self, model,is_print=False):
        score = 0
        for node in model.nodes():
            if is_print:
                printmd(f"## node: {node}")
                printmd(f"### parent: {list(model.predecessors(node))}")
                printmd(f"score: {self.fNML_score_single(node, model.predecessors(node),is_print=is_print)}")
                print(f"="*100)
            score += self.fNML_score_single(node, model.predecessors(node))
        return score
In [20]:
# Importing data
simple_df = pd.read_csv('/etlstage/PEE_joint/NUS_modules/CS5340/group_data/small.csv')
display(simple_df.head())

# making a BN
BN_1 = BayesianModel([['fare','passengerclass'], ['age','passengerclass'],['portembarked','passengerclass'], ['passengerclass','survived']])
BN_1_G = nx.draw(BN_1, with_labels=True,node_size=3e3,width=3)
plt.show()

fNMLScore(simple_df).score(BN_1,is_print=True)
age portembarked fare numparentschildren passengerclass sex numsiblings survived
0 1 1 1 1 1 1 1 1
1 2 2 1 1 2 2 1 2
2 1 1 1 1 1 2 1 2
3 2 1 1 1 2 2 1 2
4 2 1 1 1 1 1 1 1

node: fare

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):
[[869] [ 17] [ 3]]

sample_size($N$):889

fare
1 869
2 17
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =
[[6.76734313] [2.83321334] [1.09861229]]

----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$
[6.79009724]

----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-19.77332181]
 [-67.26702615]
 [-17.07445484]]
----------------------------------------------------------------------------------------------------
N_ij distribution:
 {889.0: 1}
n C_1 C_2 C_3 fNML_regret
0 889.0 1 1421.48115 2310.48115 7.745211
fNML_regret = 
[7.74521107]
----------------------------------------------------------------------------------------------------

fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$

----------------------------------------------------------------------------------------------------

score: -111.86001387305932

====================================================================================================

node: passengerclass

parent: ['fare', 'age', 'portembarked']

r_i = :3

q_i = :27.0

statecounts($N{ijk}$):
[[166. 28. 14. 182. 38. 56. 5. 0. 2. 0. 0. 0. 0. 0.

0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]

[ 19. 17. 0. 84. 43. 2. 16. 13. 0. 4. 4. 0. 3. 5.

0.   1.   0.   0.   0.   0.   0.   0.   3.   0.   0.   0.   0.]

[ 59. 8. 0. 94. 9. 2. 11. 0. 1. 0. 0. 0. 0. 0.

0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]]

sample_size($N$):889

fare 1 2 3
age 1 2 3 1 ... 3 1 2 3
portembarked 1 2 3 1 2 3 1 2 3 1 ... 3 1 2 3 1 2 3 1 2 3
passengerclass
1 166.0 28.0 14.0 182.0 38.0 56.0 5.0 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 19.0 17.0 0.0 84.0 43.0 2.0 16.0 13.0 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0

2 rows × 27 columns

----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =
[[5.11198779 3.33220451 2.63905733 5.20400669 3.63758616 4.02535169 1.60943791 0. 0.69314718 0. 0. 0.

          1. 0.
          1. 0.
      1. ] [2.94443898 2.83321334 0. 4.4308168 3.76120012 0.69314718 2.77258872 2.56494936 0. 1.38629436 1.38629436 0. 1.09861229 1.60943791 0. 0. 0. 0.
        1. 1.09861229 0.
      1. ] [4.07753744 2.07944154 0. 4.54329478 2.19722458 0.69314718 2.39789527 0. 0. 0. 0. 0.
          1. 0.
          1. 0.
      1. ]]
----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$
[5.49716823 3.97029191 2.63905733 5.88610403 4.49980967 4.09434456 3.4657359 2.56494936 1.09861229 1.38629436 1.38629436 0. 1.09861229 1.60943791 0. 0. 0. 0.

        1. 1.09861229 0.
      1. ]
----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[ -63.93995253  -17.86644729    0.         -124.14171668  -32.7644934
    -3.8636008    -9.28148995   -0.           -0.81093022   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]
 [ -48.50185568  -19.33033568   -0.         -122.24412754  -31.76021085
    -6.80239476  -11.09035489    0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.            0.            0.            0.
     0.            0.        ]
 [ -83.7582161   -15.12680297   -0.         -126.22406942  -20.72326584
    -6.80239476  -11.74624693   -0.           -1.09861229   -0.
    -0.            0.           -0.           -0.            0.
     0.            0.            0.            0.            0.
     0.            0.           -0.            0.            0.
     0.            0.        ]]
----------------------------------------------------------------------------------------------------
N_ij distribution:
 {0.0: 12, 1.0: 1, 3.0: 3, 4.0: 2, 5.0: 1, 13.0: 1, 14.0: 1, 32.0: 1, 53.0: 1, 60.0: 1, 90.0: 1, 244.0: 1, 360.0: 1}
n C_1 C_2 C_3 fNML_regret
0 0.0 1 1.000000 1.000000 0.000000
1 1.0 1 2.522797 3.522797 1.259255
2 3.0 1 6.283466 9.283466 2.228235
3 4.0 1 8.079291 12.079291 2.491493
4 5.0 1 9.848078 14.848078 2.697870
5 13.0 1 23.560907 36.560907 3.598980
6 14.0 1 25.245546 39.245546 3.669838
7 32.0 1 55.121145 87.121145 4.467300
8 53.0 1 89.464592 142.464592 4.959093
9 60.0 1 100.849497 160.849497 5.080469
10 90.0 1 149.428200 239.428200 5.478254
11 244.0 1 396.456191 640.456191 6.462181
12 360.0 1 581.470389 941.470389 6.847443
fNML_regret = 
[0.         1.25925514 2.22823492 2.49149252 2.69787042 3.59897954
 3.66983797 4.46729962 4.95909349 5.08046913 5.47825358 6.46218072
 6.8474429 ]
----------------------------------------------------------------------------------------------------

fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$

----------------------------------------------------------------------------------------------------

score: -807.1179285366169

====================================================================================================

node: age

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):
[[319] [521] [ 49]]

sample_size($N$):889

age
1 319
2 521
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =
[[5.7651911 ] [6.25575004] [3.8918203 ]]

----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$
[6.79009724]

----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-326.94505634]
 [-278.39488795]
 [-142.01556993]]
----------------------------------------------------------------------------------------------------
N_ij distribution:
 {889.0: 1}
n C_1 C_2 C_3 fNML_regret
0 889.0 1 1421.48115 2310.48115 7.745211
fNML_regret = 
[7.74521107]
----------------------------------------------------------------------------------------------------

fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$

----------------------------------------------------------------------------------------------------

score: -755.1007252943548

====================================================================================================

node: portembarked

parent: []

r_i = :3

q_i = :1.0

statecounts($N{ijk}$):
[[644] [168] [ 77]]

sample_size($N$):889

portembarked
1 644
2 168
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =
[[6.46769873] [5.12396398] [4.34380542]]

----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$
[6.79009724]

----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-207.62464006]
 [-279.91038703]
 [-188.36446965]]
----------------------------------------------------------------------------------------------------
N_ij distribution:
 {889.0: 1}
n C_1 C_2 C_3 fNML_regret
0 889.0 1 1421.48115 2310.48115 7.745211
fNML_regret = 
[7.74521107]
----------------------------------------------------------------------------------------------------

fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$

----------------------------------------------------------------------------------------------------

score: -683.64470780996

====================================================================================================

node: survived

parent: ['passengerclass']

r_i = :2

q_i = :3.0

statecounts($N{ijk}$):
[[372 80 97] [119 134 87]]

sample_size($N$):889

passengerclass 1 2 3
survived
1 372 80 97
2 119 134 87
----------------------------------------------------------------------------------------------------

$log(N_{ijk})(>0)$ =
[[5.91889385 4.38202663 4.57471098] [4.77912349 4.8978398 4.46590812]]

----------------------------------------------------------------------------------------------------

$log(N_{ij})(>0)$
[6.19644413 5.36597602 5.21493576]

----------------------------------------------------------------------------------------------------

loglikelihoods = $N{ijk} log(N{ijk}/N{ij})$ = $N_{ijk}[log(N{ijk}) - log(N{ij})]$ =

[[-103.24870175  -78.71595043  -62.10180357]
 [-168.66115553  -62.73025282  -65.16540459]]
----------------------------------------------------------------------------------------------------
N_ij distribution:
 {184.0: 1, 214.0: 1, 491.0: 1}
n C_1 C_2 fNML_regret
0 184.0 1 300.490573 5.705416
1 214.0 1 348.503600 5.853649
2 491.0 1 789.905863 6.671914
fNML_regret = 
[5.70541638 5.85364856 6.67191378]
----------------------------------------------------------------------------------------------------

fNML score = $\sum log\_likelihoods - \sum logC^{r_i}_{N_{ij}}$

----------------------------------------------------------------------------------------------------

score: -558.8542474068927

====================================================================================================
Out[20]:
-2916.5776229208836
In [21]:
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()

fNMLScore(GME_stock_df).score(BN_GMS,is_print=False)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
Out[21]:
-34736.5218691836

Comparasion

iteration over Sample size

In [22]:
display(GME_stock_df.shape)
display(GME_stock_df.head(5))
# making a BN
BN_GMS = BayesianModel([['open_price','adjclose_price'], 
                        ['high_price','volume'], 
                        ['low_price','volume'],
                        ['open_price','high_price'],
                        ['open_price','low_price'],])
nx.draw(BN_GMS, with_labels=True,node_size=3e3,width=2)
plt.show()
(2535, 6)
open_price high_price low_price close_price volume adjclose_price
date
2021-01-28 265.0 483.0 112.2 193.6 58815800.0 193.6
2021-01-27 355.0 380.0 249.0 348.0 93396700.0 348.0
2021-01-26 89.0 150.0 80.0 148.0 178588000.0 148.0
2021-01-25 97.0 159.0 61.0 77.0 177874000.0 77.0
2021-01-22 43.0 77.0 42.0 65.0 196784300.0 65.0
In [23]:
sample_size_ls = [n for n in np.arange(10, 100, 25)] + [n for n in np.arange(100, len(GME_stock_df), 100)]
print(sample_size_ls)
score_rank = {'sample_size': sample_size_ls,
              'K2Score':[],
              'BDeuScore':[],
              'BicScore':[],
              'AicScore':[],
              'fNMLScore':[],
             }

for sample_size in sample_size_ls:
    sample_data = GME_stock_df.sample(n=sample_size, random_state=1)
    
    score_rank['K2Score'].append(K2Score(sample_data).score(BN_GMS,is_print=False))
    score_rank['BDeuScore'].append(BDeuScore(sample_data, 
                                             equivalent_sample_size=sample_size/10).score(BN_GMS,is_print=False))
    score_rank['BicScore'].append(BicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['AicScore'].append(AicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['fNMLScore'].append(fNMLScore(sample_data).score(BN_GMS,is_print=False))
[10, 35, 60, 85, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500]
In [24]:
score_rank_df = pd.DataFrame(score_rank)
display(score_rank_df.head())
df_melt = score_rank_df.melt(id_vars='sample_size', value_vars=score_rank_df.columns.to_list()[1:])
df_melt['log_score'] = np.log(-df_melt.value)
fig = px.line(df_melt, x='sample_size' , y='log_score', color='variable')
fig.show()
sample_size K2Score BDeuScore BicScore AicScore fNMLScore
0 10 -110.734565 -123.862037 -1007.615980 -878.412145 -56.486788
1 35 -588.015567 -644.470570 -44858.774051 -25294.050785 -221.803120
2 60 -1056.919507 -1128.266503 -130249.237097 -63770.552003 -468.555797
3 85 -1547.604553 -1621.479937 -271745.415389 -122578.809783 -696.467425
4 100 -1836.540582 -1924.743194 -375100.402336 -163215.398184 -849.815538
In [25]:
sample_size_ls = [n for n in np.arange(10, 100, 25)] + [n for n in np.arange(100, len(GME_stock_df), 100)]
print(sample_size_ls)
score_rank = {'sample_size': sample_size_ls,
              'K2Score':[],
              'BDeuScore':[],
              'BicScore':[],
              'AicScore':[],
              'fNMLScore':[],
             }
c=1
for sample_size in sample_size_ls:
    sample_data = GME_stock_df.sample(n=sample_size, random_state=1)
    
    score_rank['K2Score'].append(K2Score(sample_data).score(BN_GMS,is_print=False))
    
    if sample_size < 200: 
        r = sample_size/5
    else:
        c+=1
        r = sample_size/(50)*c
    score_rank['BDeuScore'].append(BDeuScore(sample_data, 
                                             equivalent_sample_size=r).score(BN_GMS,is_print=False))
    score_rank['BicScore'].append(BicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['AicScore'].append(AicScore(sample_data).score(BN_GMS,is_print=False))
    score_rank['fNMLScore'].append(fNMLScore(sample_data).score(BN_GMS,is_print=False))
[10, 35, 60, 85, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500]
In [26]:
score_rank_df = pd.DataFrame(score_rank)
display(score_rank_df.head())
df_melt = score_rank_df.melt(id_vars='sample_size', value_vars=score_rank_df.columns.to_list()[1:])
df_melt['log_score'] = np.log(-df_melt.value)
fig = px.line(df_melt, x='sample_size' , y='log_score', color='variable')
fig.show()
sample_size K2Score BDeuScore BicScore AicScore fNMLScore
0 10 -110.734565 -118.813812 -1007.615980 -878.412145 -56.486788
1 35 -588.015567 -623.294678 -44858.774051 -25294.050785 -221.803120
2 60 -1056.919507 -1089.785188 -130249.237097 -63770.552003 -468.555797
3 85 -1547.604553 -1568.541795 -271745.415389 -122578.809783 -696.467425
4 100 -1836.540582 -1862.076937 -375100.402336 -163215.398184 -849.815538
In [ ]: